# import libraries here; add more as necessary
import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
# Import train_test_split.
# FIX: `sklearn.cross_validation` was deprecated in 0.18 and removed in
# scikit-learn 0.20; the modern location (already imported above) is
# sklearn.model_selection.
from sklearn.model_selection import train_test_split
from collections import defaultdict
from wordcloud import WordCloud
# magic word for producing visualizations in notebook
%matplotlib inline
# Load the Play Store apps table and the user-reviews table from CSV.
df_playstore = pd.read_csv('googleplaystore.csv',delimiter=',')
df_reviews = pd.read_csv('googleplaystore_user_reviews.csv',delimiter=',')
# Quick look at the apps data: first rows, shape, dtypes, column names.
df_playstore.head()
print (df_playstore.shape)
print (df_playstore.info())
print (df_playstore.columns)
# Per-column value distributions (helps spot malformed entries such as
# the stray '1.9' Category cleaned up below).
for col in df_playstore.columns:
    print (df_playstore[col].value_counts())
# Peek at the reviews table and its sentiment label distribution.
df_reviews.head()
df_reviews['Sentiment'].value_counts()
Define -
Data cleaning steps- df_playstore
1. Drop duplicates in 'App' column googleplaystore
2. Remove "M" & "k" from Size column. Convert value to MB and dtype to float."Varies with device" would be NaN. Replace 1,000+ with 1.
3. Remove "+" from Installs column. Convert to int.
4. Remove single row in playstore dataframe where Category = 1.9.
5. In the Installs column, replace the stray 'Free' value with 0 (the column is then converted to int).
6. Remove $ sign from Price column and convert to float
7. Genres - split on ';' and keep only the first part, discarding the remainder.
8. Last Updated - change dtype to datetime and time
9. Current Ver - do nothing as of now.
10. Android Ver - split and remove the " and up" suffix... if time allows!
df_playstore.columns
# Remove duplicate app entries, keeping the first occurrence of each name.
df_playstore.drop_duplicates(subset=['App'], keep='first', inplace=True)
# Locate the single malformed row whose Category holds the stray value '1.9'.
bad_rows = list(df_playstore.loc[df_playstore['Category'] == '1.9'].index)
bad_rows
# Drop that row in place (index labels collected above).
df_playstore.drop(bad_rows, axis=0, inplace=True)
# Clean data for Size column
# 'Varies with device' carries no numeric size; treat it as missing.
df_playstore['Size'].replace('Varies with device',np.nan,inplace=True)
# Strip the trailing 'M' so megabyte values become bare numbers.
df_playstore['Size'] = df_playstore['Size'].map(lambda x: str(x)[:-1] if 'M' in str(x) else x)
# Strip a trailing '+' (e.g. '1,000+' -> '1,000').
df_playstore['Size'] = df_playstore['Size'].map(lambda x: str(x)[:-1] if '+' in str(x) else x)
# Convert kilobyte entries to megabytes (drop 'k', divide by 1000).
df_playstore['Size'] = df_playstore['Size'].apply(lambda x: float(str(x).replace('k', '')) / 1000 if 'k' in str(x) else x)
# The '+'-stripped '1,000' entry still holds a comma; map it to '1' (MB),
# per cleaning step 2 of the plan above.
df_playstore['Size'].replace('1,000','1',inplace=True)
df_playstore['Size'] = df_playstore['Size'].astype('float')
df_playstore['Size'].dtype
# --- Installs column: strip formatting and cast to int ---
# Drop the trailing '+' (e.g. '10,000+' -> '10,000').
df_playstore['Installs'] = df_playstore['Installs'].apply(lambda v: str(v)[:-1] if '+' in str(v) else v)
# Remove thousands separators.
df_playstore['Installs'] = df_playstore['Installs'].apply(lambda v: str(v).replace(',', '') if ',' in str(v) else v)
# A stray 'Free' value slipped into Installs; treat it as zero installs.
df_playstore['Installs'].replace('Free', '0', inplace=True)
df_playstore['Installs'] = df_playstore['Installs'].astype('int')
df_playstore['Installs'].dtype
# Cast Rating to float and Reviews to int; both arrive as object dtype.
df_playstore['Rating'] = df_playstore['Rating'].astype(float)
df_playstore['Reviews'] = df_playstore['Reviews'].astype(int)
df_playstore['Reviews'].dtype
# --- Price column: strip the '$' sign, rename for clarity, cast to float ---
df_playstore['Price'] = df_playstore['Price'].apply(lambda v: str(v).replace('$', '') if '$' in str(v) else v)
df_playstore.rename(columns={'Price': 'Price_in_dollars'}, inplace=True)
df_playstore['Price_in_dollars'] = df_playstore['Price_in_dollars'].astype(float)
print(df_playstore['Price_in_dollars'].dtype)
# Genres like 'Art & Design;Creativity' are collapsed to the first genre only.
print(df_playstore['Genres'].shape)
df_playstore['Genres'] = df_playstore['Genres'].apply(lambda g: str(g).split(';')[0] if ';' in str(g) else g)
print(df_playstore['Genres'].shape[0])
print(df_playstore['Genres'].dtype)
# Change dtype of Last Updated to date time.
# NOTE(review): dates in this dataset look like 'January 7, 2018', so
# dayfirst=True has no effect here — confirm if the input format changes.
df_playstore['Last Updated'] = pd.to_datetime(df_playstore['Last Updated'],dayfirst=True)
# Percentage of missing values per column (before imputation).
missing_data = df_playstore.isnull().sum()
missing_data = (missing_data[missing_data>0]/df_playstore.shape[0]) * 100
missing_data.sort_values(inplace=True)
missing_data.plot.bar(title = 'Column wise percentage missing', figsize=(6,3))
#Fill NaN values with mean of respective columns.
#np.where(np.isnan(df_playstore))
df_playstore['Rating'].fillna(df_playstore['Rating'].mean(),inplace=True)
df_playstore['Size'].fillna(df_playstore['Size'].mean(),inplace=True)
# Re-plot the missing percentages to confirm Rating/Size are now complete.
missing_data = df_playstore.isnull().sum()
missing_data = (missing_data[missing_data>0]/df_playstore.shape[0]) * 100
missing_data.sort_values(inplace=True)
missing_data.plot.bar(title = 'Column wise percentage missing', figsize=(6,3))
df_playstore.head()
# Row-wise missing-value counts, expressed as a percentage of all rows.
# NOTE: despite the original comment, no rows are dropped here — this
# only reports them.
missing_data = df_playstore.isnull().sum(axis=1)
missing_data = (missing_data[missing_data>0]/df_playstore.shape[0]) * 100
missing_data.sort_values(inplace=True)
# FIX: the title said 'Column wise' but this chart is per-ROW.
missing_data.plot.bar(title = 'Row wise percentage missing', figsize=(6,3))
#print (df_playstore.iloc[1553])
#Scatterplot Matrix from seaborn
# BUG FIX: the original zipped six independently filtered Series; because
# each filter drops different rows, zip() paired values from UNRELATED
# apps. Filter once on a single dataframe so every row stays aligned.
plot_df = df_playstore[['Rating', 'Size', 'Installs', 'Reviews', 'Type',
                        'Price_in_dollars']].dropna()
plot_df = plot_df[(plot_df['Installs'] != 0) & (plot_df['Reviews'] != 0)].copy()
# Log-compress the heavy-tailed count columns, as the original intended.
plot_df['Installs'] = np.log(plot_df['Installs'])
plot_df['Reviews'] = np.log10(plot_df['Reviews'])
p = sns.pairplot(plot_df, hue='Type', palette="Set2")
# Number of distinct app categories, then an apps-per-category bar chart.
len(df_playstore['Category'].value_counts())
df_playstore['Category'].value_counts().plot.bar(title = 'Number of Categorical Apps', figsize=(30,20),fontsize=18)
Categories = df_playstore['Category'].unique()
Categories
df_playstore.head()
df_playstore.groupby('Category')
# Joint distribution of installs vs rating.
# (Removed two dead locals `a` and `b` that were computed but never used.)
sns.jointplot(x=df_playstore['Installs'],y=df_playstore['Rating'])
# One-hot encode the Type column (Free/Paid) into Type_* dummy columns.
df_playstore_temp=pd.get_dummies(df_playstore,prefix='Type',columns=['Type'])
# Unique category names (referenced by the commented-out encoding scratch below).
values = np.array(df_playstore_temp['Category'].unique())
values
# #label encoding Categorical values
# label_encoder = LabelEncoder()
# integer_encoded = label_encoder.fit_transform(values)
# print (integer_encoded)
# #One hot encoding categorical values
# onehot_encoder = OneHotEncoder(sparse=False)
# integer_encoded = integer_encoded.reshape(len(integer_encoded),1)
# onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
# print (onehot_encoded)
# ##Inverse transform one hot encoded
# # inverted = label_encoder.inverse_transform([argmax(onehot_encoded[0,:])])
# # print(inverted)
#Label encode Category column in dataframe
# A single encoder instance is reused because each fit_transform result
# is consumed immediately.
labelencoder = LabelEncoder()
df_playstore_temp['Category'] = labelencoder.fit_transform(df_playstore_temp['Category'])
df_playstore_temp['Content Rating'] = labelencoder.fit_transform(df_playstore_temp['Content Rating'])
# Drop non-numeric columns not used for modelling.
# FIX: removed the misleading `axis=0` — pandas ignores `axis` when
# `columns=` is given, so it only confused readers.
df_playstore_temp.drop(columns=['App','Genres','Last Updated','Current Ver','Android Ver'],inplace=True)
df_playstore_temp.head()
#fill-in column level mean values and replace nan uniformity in the dataframe.
df_playstore_temp.fillna(df_playstore_temp.mean(),inplace=True)
# Make all values numeric in dataframe before scaling.
# BUG FIX: the original discarded the result of apply(); assigning it back
# makes the conversion actually take effect.
df_playstore_temp = df_playstore_temp.apply(pd.to_numeric)
# Sanity check: indices of any NaN still present (should be empty).
np.where(np.isnan(df_playstore_temp))
# Standardize every feature to zero mean / unit variance.
scaled_features = StandardScaler().fit_transform(df_playstore_temp.values)
df_scaled_features = pd.DataFrame(scaled_features,index=df_playstore_temp.index,columns = df_playstore_temp.columns)
df_scaled_features.describe()
# Helper: visualize how much variance each PCA component explains.
def scree_plot(pca):
    '''
    Draw a scree plot for a fitted PCA: one bar per principal component
    showing its explained-variance ratio, overlaid with the cumulative
    variance curve.

    INPUT: pca - a fitted sklearn.decomposition.PCA instance
    OUTPUT: None (renders a matplotlib figure)
    '''
    ratios = pca.explained_variance_ratio_
    n_comp = len(ratios)
    xs = np.arange(n_comp)
    cumulative = np.cumsum(ratios)

    plt.figure(figsize=(25, 10))
    axis = plt.subplot(111)
    axis.bar(xs, ratios)
    axis.plot(xs, cumulative)
    print(xs, cumulative)

    # Annotate each bar with its percentage (truncated to 4 characters).
    for k in range(n_comp):
        axis.annotate(r"%s%%" % ((str(ratios[k] * 100)[:4])), (xs[k] + 0.2, ratios[k]),
                      va="bottom", ha="center", fontsize=12)

    axis.xaxis.set_tick_params(width=0)
    axis.yaxis.set_tick_params(width=2, length=12)
    axis.set_xlabel("Principal Component")
    axis.set_ylabel("Variance Explained (%)")
    plt.title('Explained Variance Per Principal Component')
# Fit PCA keeping 9 components on the standardized feature matrix,
# then visualize the per-component explained variance.
pca = PCA(n_components=9)
pca_scaled_features = pca.fit_transform(scaled_features)
scree_plot(pca)
# Map the weights of one principal component to the corresponding feature
# names and return the pairs sorted by weight. `ix` is 0-based.
def sorted_weights(pca, ix, dataset):
    '''
    Return [(weight, feature_name), ...] for PCA component `ix` of `pca`,
    taking feature names from `dataset`'s columns, sorted ascending by weight.
    '''
    weights = pca.components_[ix]
    feature_names = dataset.keys().values
    return sorted(zip(weights, feature_names), key=lambda pair: pair[0])
# Inspect component weights.
# NOTE(review): pca.components_ is 0-indexed, so ix=1 is the SECOND
# principal component — the "first" component mentioned above is ix=0.
sorted_weights(pca,1,df_scaled_features)
sorted_weights(pca,2,df_scaled_features)
sorted_weights(pca,3,df_scaled_features)
PCA Analysis results- NOTE -
PCA analysis shows that 8 components have variance between 23.4% to 4.15% which would affect the prediction of outcomes.
As a rule of thumb, a component weight (positive or negative) above 0.5 is deemed to affect the prediction. Hence, in the first set of selected features, review counts and installation numbers play a big role in predicting which category of apps would receive reviews and higher numbers of installations.
In the second set of components, the weights for Category, Installs, Size and Content Rating are inversely related. This implies that app size and number of installs are dependent, and that customers prefer to install smaller apps.
The third set of components shows that app price and ratings have a strong inverse relationship: customers who give higher ratings tend to install cheaper apps.
KMean clustering of dataset shows that there are three clusters of data in majority.
def plot_data(data, labels):
    '''
    3-D scatter of the first three columns of `data`, with point colors
    taken from `labels` (tab10 colormap).
    '''
    figure = plt.figure()
    axes3d = Axes3D(figure)
    axes3d.scatter(data[:, 0], data[:, 1], data[:, 2], c=labels, cmap='tab10')
#Evaluate unsupervised learning on the PCA-projected data with 15 clusters.
#(Comment fixed: the original said 8, but n_clusters below is 15.)
kmeans_pop = KMeans(n_clusters=15)
model_pop = kmeans_pop.fit(pca_scaled_features)
labels_pop = model_pop.predict(pca_scaled_features)
plot_data(pca_scaled_features,labels_pop)
#Repeat with 7 clusters for comparison.
#(Comment fixed: the original said 8, but n_clusters below is 7.)
kmeans_pop = KMeans(n_clusters=7)
model_pop = kmeans_pop.fit(pca_scaled_features)
labels_pop = model_pop.predict(pca_scaled_features)
plot_data(pca_scaled_features,labels_pop)
df_playstore_temp.head()
Use Linear Regression model to predict the following-
1. Predict Ratings of apps for all categories.
2. Predict Pricing of apps.
3. Predict pricing in relation to number of installations
#Split the dataset into features and target labels. Rating column is the target label.
Target_label = df_playstore_temp['Rating']
features_label = df_playstore_temp.drop('Rating',axis=1)
# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    features_label, Target_label, test_size=0.2, random_state=0)
# Report the split sizes.
print(f"Training set has {X_train.shape[0]} samples.")
print(f"Testing set has {X_test.shape[0]} samples.")
#Train using LinearRegression model
# (normalize=True matches the legacy scikit-learn API this notebook
# already targets; the parameter was removed in scikit-learn 1.2.)
lm_model = LinearRegression(normalize=True)
# FIX: narrowed the bare `except:` to Exception so KeyboardInterrupt /
# SystemExit propagate and the actual failure is printed instead of hidden.
try:
    lm_model.fit(X_train, y_train)
except Exception as err:
    print("Oh no! It doesn't work!!!")
    print(err)
y_test_preds = lm_model.predict(X_test)# Predictions here
r2_test = r2_score(y_test, y_test_preds) # Rsquared here
# Print r2 to see result
print('R Squared value of the predicted labels: ' + str(r2_test))
# Iterating X_test yields its column names, pairing each coefficient
# with its feature.
coefficients = list(zip(lm_model.coef_,X_test))
#print(coefficients)
print ('Mean Squared Error: '+ str(metrics.mean_squared_error(y_test,y_test_preds)))
print ('Mean absolute Error: '+ str(metrics.mean_absolute_error(y_test,y_test_preds)))
print ('Mean squared Log Error: '+ str(metrics.mean_squared_log_error(y_test,y_test_preds)))
# Linear regression coefficients for Ratings as target labels.
coefficients
# Plot predicted vs actual ratings with a fitted regression line.
plt.figure(figsize=(12,7))
sns.regplot(y_test_preds,y_test,color='teal',marker = 'x')
#sns.regplot(y_test_preds1,y_test1,color='orange')
# NOTE(review): no artist here carries a label, so plt.legend() emits a
# 'no handles' warning and draws nothing — confirm whether it is needed.
plt.legend()
plt.title('Linear Regression model- App Ratings')
plt.xlabel('Predicted Ratings')
plt.ylabel('Actual Ratings')
plt.show()
#Split the dataset into features and target labels. Price_in_dollars is the
#target label (comment fixed: the original wrongly said Rating).
Target_label1 = df_scaled_features['Price_in_dollars']
features_label1 = df_scaled_features.drop('Price_in_dollars',axis=1)
# Split the features into training and testing sets
X_train1, X_test1, y_train1, y_test1 = train_test_split(features_label1,Target_label1,test_size = 0.2,random_state = 0)
# Show the results of the split
print("Training set has {} samples.".format(X_train1.shape[0]))
print("Testing set has {} samples.".format(X_test1.shape[0]))
#Train using LinearRegression model
lm_model1 = LinearRegression(normalize=True)
# FIX: bare `except:` narrowed to Exception; the caught error is printed.
try:
    lm_model1.fit(X_train1, y_train1)
except Exception as err:
    print("Oh no! It doesn't work!!!")
    print(err)
y_test_preds1 = lm_model1.predict(X_test1)# Predictions here
r2_test1 = r2_score(y_test1, y_test_preds1) # Rsquared here
# Pair coefficients with feature (column) names, then report metrics.
coefficients1 = list(zip(lm_model1.coef_,X_test1))
print('R Squared value of the predicted labels: ' + str(r2_test1))
print ('Mean Squared Error: '+ str(metrics.mean_squared_error(y_test1,y_test_preds1)))
print ('Mean absolute Error: '+ str(metrics.mean_absolute_error(y_test1,y_test_preds1)))
# MSLE stays disabled: standardized targets/predictions can be negative,
# which makes mean_squared_log_error raise.
#print ('Mean squared Log Error: '+ str(metrics.mean_squared_log_error(y_test1,y_test_preds1)))
# Linear regression coefficients for Price as target labels.
coefficients1
# Plot actual vs predicted price with a fitted regression line.
plt.figure(figsize=(12,7))
#sns.regplot(y_test_preds,y_test,color='teal',marker = 'x')
#sns.regplot(y_test_preds1,y_test1,color='orange')
# Axes are swapped relative to the ratings chart: actual on x, predicted on y.
sns.regplot(y_test1,y_test_preds1,color='teal',marker='x')
# NOTE(review): no labelled artists — legend() warns and draws nothing.
plt.legend()
plt.title('Linear Regression model- predicted price vs actual price')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.show()
#Split the dataset into features and target labels.'Installs'column is the target label.
Target_label2 = df_scaled_features['Installs']
features_label2 = df_scaled_features.drop('Installs',axis=1)
# Split the features into training and testing sets
X_train2, X_test2, y_train2, y_test2 = train_test_split(features_label2,Target_label2,test_size = 0.2,random_state = 0)
# Show the results of the split
print("Training set has {} samples.".format(X_train2.shape[0]))
print("Testing set has {} samples.".format(X_test2.shape[0]))
#Train using LinearRegression model
lm_model2 = LinearRegression(normalize=True)
# FIX: bare `except:` narrowed to Exception; the caught error is printed.
try:
    lm_model2.fit(X_train2, y_train2)
except Exception as err:
    print("Oh no! It doesn't work!!!")
    print(err)
#Predict function
y_test_preds2 = lm_model2.predict(X_test2)# Predictions here
#R square function
r2_test2 = r2_score(y_test2, y_test_preds2) # Rsquared here
# Pair coefficients with feature (column) names, then report metrics.
coefficients2 = list(zip(lm_model2.coef_,X_test2))
print('R Squared value of the predicted labels: ' + str(r2_test2))
print ('Mean Squared Error: '+ str(metrics.mean_squared_error(y_test2,y_test_preds2)))
print ('Mean absolute Error: '+ str(metrics.mean_absolute_error(y_test2,y_test_preds2)))
#print ('Mean squared Log Error: '+ str(metrics.mean_squared_log_error(y_test2,y_test_preds2)))
coefficients2
# Overlay the price-model predictions (teal, from the previous section)
# with the installs-model predictions (orange) — intentional per the
# discussion below.
plt.figure(figsize=(12,7))
sns.regplot(y_test_preds1,y_test1,color='teal',marker = 'x')
sns.regplot(y_test_preds2,y_test2,color='orange')
# NOTE(review): no labelled artists — legend() warns and draws nothing.
plt.legend()
plt.title('Linear Regression model- Predicted Price vs Installs')
plt.xlabel('Predicted Installs')
plt.ylabel('Predicted Price')
plt.show()
Predicted Price vs Predicted Installs
The Linear Regression model shows that predicted price and number of installs are proportional and tend to increase together (the small blue line is the predicted price and the large orange line is the predicted installs). From the violin plot at the end it is clear that, for certain categories of Play Store apps, consumers are willing to pay.
Apps in categories like Events, Lifestyle, Games, and Books & Libraries range from free to the highest prices, whereas Books & Reference, Dating, Finance, Health & Fitness, Gaming, Family, and Medical apps have lower prices.
Apps in categories like Game, Lifestyle, Family, Medical, Travel and local, productivity, Health and fitness get paid for with higher number of installations.
# Strip plot of app price per category.
# NOTE(review): filtering on isin(...unique()) keeps every row whose
# Category appears in the data — effectively a no-op copy of df_playstore.
subset_df = df_playstore[df_playstore.Category.isin(df_playstore['Category'].unique())]
sns.set_style('darkgrid')
fig, ax = plt.subplots()
fig.set_size_inches(15, 8)
p = sns.stripplot(x="Price_in_dollars", y="Category", data=subset_df, jitter=True, linewidth=1)
title = ax.set_title('App pricing trend across all categories')
# Same layout, but plotting install counts per category.
subset_df = df_playstore[df_playstore.Category.isin(df_playstore['Category'].unique())]
sns.set_style('darkgrid')
fig, ax = plt.subplots()
fig.set_size_inches(15, 8)
p = sns.stripplot(x="Installs", y="Category", data=subset_df, jitter=True, linewidth=1)
title = ax.set_title('App Installation trend across all categories')
Reviews and Installs have a strong correlation, which is expected in this dataset.
# Correlation heatmap for the numeric, encoded version of the Play Store data.
sns.heatmap(df_playstore_temp.corr(),annot=True,fmt='.2f')
Histograms for the encoded, numeric version of the Play Store dataset.
df_playstore_temp.hist(figsize=(15,10))
Paid apps tend to receive better ratings and a larger number of ratings.
# Joint distributions: price vs rating (red), installs vs rating (cyan).
sns.jointplot(df_playstore['Price_in_dollars'],df_playstore['Rating'],color='r')
sns.jointplot(df_playstore['Installs'],df_playstore['Rating'],color='c')
# Keep only categories with at least 50 apps, then report the mean rating.
groups = df_playstore.groupby('Category').filter(lambda x: len(x) >= 50).reset_index()
print('Average rating = ', np.nanmean(list(groups.Rating)))
# NOTE(review): `c` (an HSL color list) is computed but never used below.
c = ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0, 720, len(set(groups.Category)))]
# Plotly layout: the dashed horizontal line marks the overall mean rating.
layout = {'title' : 'App rating distribution and probability density for all categories',
'xaxis': {'tickangle':-40},
'yaxis': {'title': 'Rating'},
'plot_bgcolor': 'rgb(250,250,250)',
'shapes': [{
'type' :'line',
'x0': -.5,
'y0': np.nanmean(list(groups.Rating)),
'x1': 19,
'y1': np.nanmean(list(groups.Rating)),
'line': { 'dash': 'dashdot'}
}]
}
# One violin trace per category that survived the >=50-apps filter.
data = [{
'y': df_playstore.loc[df_playstore.Category==category]['Rating'],
'type':'violin',
'name' : category,
'showlegend':False,
#'marker': {'color': 'Set2'},
} for i,category in enumerate(list(set(groups.Category)))]
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode()
plotly.offline.iplot({'data': data, 'layout': layout})
Average rating for all App Categories
This chart shows median value of Ratings per Category together with Max Rating received and minimum Rating. This chart also shows combination of ratings distribution and probability density per category.
Tools, Productivity, and Finance categories have ratings spanning the full range from 1 to 5.
Assuming that rating is a proxy for popularity, the width and length of each curved area implies that Comics, Health & Fitness, Parenting, and Art & Design have their ratings concentrated in a narrow band (i.e. consistently rated apps).
Reference- https://en.wikipedia.org/wiki/Violin_plot https://en.wikipedia.org/wiki/Kernel_density_estimation http://seaborn.pydata.org/generated/seaborn.violinplot.html
# Keep only categories with at least 300 apps.
groups = df_playstore.groupby('Category').filter(lambda x: len(x) >= 300).reset_index()
# FIX: the printed label said 'Average rating' but this is the mean PRICE.
print('Average price = ', np.nanmean(list(groups.Price_in_dollars)))
# NOTE(review): `c` is computed but never used below.
c = ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0, 720, len(set(groups.Category)))]
# Plotly layout: the dashed horizontal line marks the overall mean price.
layout = {'title' : 'App Price distribution and probability density for top 10 categories',
'xaxis': {'tickangle':-40},
'yaxis': {'title': 'Price_in_dollars'},
'plot_bgcolor': 'rgb(250,250,250)',
'shapes': [{
'type' :'line',
'x0': -.5,
'y0': np.nanmean(list(groups.Price_in_dollars)),
'x1': 19,
'y1': np.nanmean(list(groups.Price_in_dollars)),
'line': { 'dash': 'dashdot'}
}]
}
# One violin trace per category that survived the >=300-apps filter.
data = [{
'y': df_playstore.loc[df_playstore.Category==category]['Price_in_dollars'],
'type':'violin',
'name' : category,
'showlegend':False,
#'marker': {'color': 'Set2'},
} for i,category in enumerate(list(set(groups.Category)))]
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode()
plotly.offline.iplot({'data': data, 'layout': layout})
Define-
1. What do users think about the Apps? Keywords?
2. Sentiment Analysis.
df_reviews.head()
A word cloud of the most frequently used words in user reviews.
# Replace missing reviews with empty strings.
df_reviews['Translated_Review'].replace(np.NaN,'',inplace=True)
# BUG FIX: the original passed the unbound `.dropna` method (no call) to
# np.array, so the word cloud was generated from the method's repr rather
# than the review text. Join the actual review strings instead.
review = ' '.join(df_reviews['Translated_Review'].dropna().astype(str))
# Create and generate a word cloud image from the review text.
wordcloud = WordCloud(max_words=100, background_color='white').generate(review)
# Display the generated image.
# FIX: the figure must be created BEFORE imshow, otherwise figsize applies
# to a new, empty figure.
plt.figure(figsize=(40,30))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
A wordcloud of most frequently used words used for App names.
# Replace missing app names with empty strings.
df_reviews['App'].replace(np.NaN,'',inplace=True)
# BUG FIX: the original built an array from ENTIRE deduplicated rows, so
# the word cloud mixed review text and sentiment values with the names.
# Use only the App column, deduplicated.
Appname = ' '.join(df_reviews['App'].dropna().drop_duplicates().astype(str))
# Create and generate a word cloud image from the app names.
wordcloud = WordCloud(max_words=200, background_color='white').generate(Appname)
# Display the generated image (figure created before imshow so the
# figsize actually applies).
plt.figure(figsize=(40,30))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()